Biostat 203B Homework 3

Due Feb 23 @ 11:59PM

Author

Hanbei Xiong 605257780

Display machine information for reproducibility:

sessionInfo()
R version 4.3.2 (2023-10-31)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS Monterey 12.6

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: America/Los_Angeles
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

loaded via a namespace (and not attached):
 [1] htmlwidgets_1.6.2 compiler_4.3.2    fastmap_1.1.1     cli_3.6.1        
 [5] tools_4.3.2       htmltools_0.5.7   rstudioapi_0.15.0 yaml_2.3.7       
 [9] rmarkdown_2.25    knitr_1.45        jsonlite_1.8.7    xfun_0.41        
[13] digest_0.6.33     rlang_1.1.1       evaluate_0.23    

Load necessary libraries (you can add more as needed).

knitr::opts_chunk$set(warning = FALSE, message = FALSE) 
library(arrow)
library(memuse)
library(pryr)
library(R.utils)
library(tidyverse)
library(gtsummary)

Display your machine memory.

memuse::Sys.meminfo()
Totalram:  16.000 GiB 
Freeram:    3.485 GiB 

In this exercise, we use tidyverse (ggplot2, dplyr, etc) to explore the MIMIC-IV data introduced in homework 1 and to build a cohort of ICU stays.

Q1. Visualizing patient trajectory

Visualizing a patient’s encounters in a health care system is a common task in clinical data analysis. In this question, we will visualize a patient’s ADT (admission-discharge-transfer) history and ICU vitals in the MIMIC-IV data.

Q1.1 ADT history

A patient’s ADT history records the time of admission, discharge, and transfer in the hospital. This figure shows the ADT history of the patient with subject_id 10001217 in the MIMIC-IV data. The x-axis is the calendar time, and the y-axis is the type of event (ADT, lab, procedure). The color of the line segment represents the care unit. The size of the line segment represents whether the care unit is an ICU/CCU. The crosses represent lab events, and the shape of the dots represents the type of procedure. The title of the figure shows the patient’s demographic information and the subtitle shows top 3 diagnoses.

Do a similar visualization for the patient with subject_id 10013310 using ggplot.

Hint: We need to pull information from data files patients.csv.gz, admissions.csv.gz, transfers.csv.gz, labevents.csv.gz, procedures_icd.csv.gz, diagnoses_icd.csv.gz, d_icd_procedures.csv.gz, and d_icd_diagnoses.csv.gz. For the big file labevents.csv.gz, use the Parquet format you generated in Homework 2. For reproducibility, make the Parquet folder labevents_pq available at the current working directory hw3, for example, by a symbolic link. Make your code reproducible.

Answer:

# loading data
patients <- read_csv("~/mimic/hosp/patients.csv.gz")
admission <- read_csv("~/mimic/hosp/admissions.csv.gz")
transfers <- read_csv("~/mimic/hosp/transfers.csv.gz")
procedures_icd <- read_csv("~/mimic/hosp/procedures_icd.csv.gz")
diagnoses_icd <- read_csv("~/mimic/hosp/diagnoses_icd.csv.gz")
d_icd_procedures <- read_csv("~/mimic/hosp/d_icd_procedures.csv.gz")
d_icd_diagnoses <- read_csv("~/mimic/hosp/d_icd_diagnoses.csv.gz")
labevents <- arrow::open_dataset("./labevents_pq") |>
  collect()
# change subject_id to reproduce result for any patient 
id = 10013310
procedure_info = procedures_icd |> 
  # exclude rows with other subject id
  filter(subject_id == id) |>
  # join table to get the type procedures 
  left_join(d_icd_procedures, by = "icd_code") |>
  # convert date variable to POSIXct format
  mutate(chartdate = as.POSIXct(chartdate, format = "%Y-%m-%d")) |>
  # represent the type of procedures with first 30 characters
  mutate(long_title = str_sub(long_title, 1, 50))


diagnoses_info = diagnoses_icd |> 
  # exclude rows with other subject id
  filter(subject_id == id) |> 
  # join table to get the type diagnoses
  left_join(d_icd_diagnoses, by = c("icd_code", "icd_version")) |>
  # sort in ascending order by hadm id and sequence number
  arrange(hadm_id, seq_num)
# exclude rows with other subject id
patients_filtered <- patients |> filter(subject_id == id)

admission_filtered <- admission |> 
  # exclude rows with other subject id
  filter(subject_id == id) |> 
  # keep one unique row and all variables
  distinct(subject_id, .keep_all = TRUE)

# create a new variable to indicate whether the care unit is an ICU/CCU
transfer_icu <- transfers  |> 
  # exclude rows with other subject id
  filter(subject_id == id) |> 
  # create a new variable to indicate whether the care unit is in ICU/CCU 
  mutate(icu_status = ifelse(str_detect(careunit, "ICU|CCU"), 
                      "ICU/CCU", "non-ICU/CCU")) |>
  # exclude rows with missing value in careunit
  filter(!is.na(careunit))
# store subject_id of the patient
id_char = toString(patients_filtered['subject_id'])
# store gender of the patient
gender_char = toString(patients_filtered['gender'])
# store recorded age of the patient
age_char = toString(patients_filtered['anchor_age'])
# store race of the patient
race_char = toString(admission_filtered['race'])

# store top three most occurrence diagonsis of the patient
diagnoses_info_1 = diagnoses_info['long_title'] |> slice(1)
diagnoses_info_2 = diagnoses_info['long_title'] |> slice(2)
diagnoses_info_3 = diagnoses_info['long_title'] |> slice(3)

# create title and subtitle
title = paste0("Patient ", id_char, ", ", gender_char, ", ", age_char, 
               " years old, ", race_char)
subtitle = paste0(diagnoses_info_1, "\n", diagnoses_info_2, "\n", 
                  diagnoses_info_3)
# data visualization
ggplot() +
  # create a line segment to represent the hospital stay and use 
  # different color of segment to represent stays in different care units
  geom_segment(data = transfer_icu, 
               aes(x = intime, xend = outtime, y = 3, yend = 3,  
                   color = careunit),
               # adjust the width of the line segment of indicate icu stay
               linewidth = ifelse(transfer_icu$icu_status == 'ICU/CCU', 
                                  4, 1)) +
  # create points to represent the charted lab events with the specific 
  # subject id
  geom_point(data = labevents |> filter(subject_id == id), 
             aes(x = charttime, y = 2), shape = 3, size = 1) +
  # create points to represent the charted procedures and use different
  # shape of points to represent different procedures
  geom_point(data = procedure_info, 
             aes(x = chartdate, y = 1, shape = long_title), size = 3) + 
  # rename the legend for procedures and manually assign shapes
  scale_shape_manual(values = c(1:9), name = "Procedure") +
  # rename the legend for care units
  scale_color_discrete(name = "Care Unit") +
  # adjust legend orders and the number of rows and columns
  guides(color = guide_legend(order = 2, nrow = 2, byrow = TRUE),
         shape = guide_legend(order = 1, nrow = 5, byrow = TRUE)) +
  # rename the x and y axis
  xlab('Calender Time') +
  ylab('') +
  # adjust the theme of the legend
  theme(
    legend.text = element_text(size = 6),
    legend.position = "bottom",
    legend.box = "vertical",
    legend.title.align = 0,
    legend.direction = 'horizontal') +
  # assign name to each plot on y axis
  scale_y_continuous(breaks = 1:3, labels = c("Procedure", "Lab", "ADT")) + 
  # assign title and subtitle
  ggtitle(label = title, subtitle = subtitle)

Q1.2 ICU stays

ICU stays are a subset of ADT history. This figure shows the vitals of the patient 10001217 during ICU stays. The x-axis is the calendar time, and the y-axis is the value of the vital. The color of the line represents the type of vital. The facet grid shows the abbreviation of the vital and the stay ID.

Do a similar visualization for the patient 10013310.

Answer:

# manually adjust this subject id to display information for other patient
id = 10013310
# read in vitals information
d_items <- read_csv("~/mimic/icu/d_items.csv.gz")

# read in chartevents information and exclude rows with other subject id
chartevents <- arrow::open_dataset("./chartevents_pq") |> 
  filter(subject_id == id) |>
  collect()

# read in icu stays information and exclude rows with other subject id
icustays <- read_csv("~/mimic/icu/icustays.csv.gz") |>
  filter(subject_id == id)
# filter out the vitals and variables that are not of interest 
d_items <- d_items |> 
  filter(itemid %in% c(220045, 220179, 220180, 223761, 220210)) |>
  select(itemid, abbreviation)
# filter out the vitals that are not of interest and join with d_items to 
# get the abbreviation of vitals
chartevents <- chartevents |> 
  filter(itemid %in% d_items$itemid) |>
  left_join(d_items, by = "itemid")
# plot vitals during each ICU stays
ggplot(chartevents, aes(x = charttime, y = valuenum, color = abbreviation)) +
  geom_point() +
  geom_line() +
  # set x and y scales of each facet to be free
  facet_grid(abbreviation ~stay_id, scales = "free") +
  # turn off legend
  theme(legend.position = "none") +
  # add title
  ggtitle(label = paste0("Patient ", id, " ICU stays - Vitals")) +
  # adjust x axis to display date and time in a more readable format
  scale_x_datetime(guide = guide_axis(n.dodge = 2))

ls -l ~/mimic/icu/
total 6155968
-rw-rw-r--@ 1 hanbeixiong  staff       35893 Jan  5  2023 caregiver.csv.gz
-rw-rw-r--@ 1 hanbeixiong  staff  2467761053 Jan  5  2023 chartevents.csv.gz
-rw-rw-r--@ 1 hanbeixiong  staff       57476 Jan  5  2023 d_items.csv.gz
-rw-rw-r--@ 1 hanbeixiong  staff    45721062 Jan  5  2023 datetimeevents.csv.gz
-rw-rw-r--@ 1 hanbeixiong  staff     2614571 Jan  5  2023 icustays.csv.gz
-rw-rw-r--@ 1 hanbeixiong  staff   251962313 Jan  5  2023 ingredientevents.csv.gz
-rw-rw-r--@ 1 hanbeixiong  staff   324218488 Jan  5  2023 inputevents.csv.gz
-rw-rw-r--@ 1 hanbeixiong  staff    38747895 Jan  5  2023 outputevents.csv.gz
-rw-rw-r--@ 1 hanbeixiong  staff    20717852 Jan  5  2023 procedureevents.csv.gz
zcat < ~/mimic/icu/icustays.csv.gz | head
subject_id,hadm_id,stay_id,first_careunit,last_careunit,intime,outtime,los
10000032,29079034,39553978,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2180-07-23 14:00:00,2180-07-23 23:50:47,0.4102662037037037
10000980,26913865,39765666,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2189-06-27 08:42:00,2189-06-27 20:38:27,0.4975347222222222
10001217,24597018,37067082,Surgical Intensive Care Unit (SICU),Surgical Intensive Care Unit (SICU),2157-11-20 19:18:02,2157-11-21 22:08:00,1.1180324074074075
10001217,27703517,34592300,Surgical Intensive Care Unit (SICU),Surgical Intensive Care Unit (SICU),2157-12-19 15:42:24,2157-12-20 14:27:41,0.9481134259259258
10001725,25563031,31205490,Medical/Surgical Intensive Care Unit (MICU/SICU),Medical/Surgical Intensive Care Unit (MICU/SICU),2110-04-11 15:52:22,2110-04-12 23:59:56,1.338587962962963
10001884,26184834,37510196,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2131-01-11 04:20:05,2131-01-20 08:27:30,9.171817129629629
10002013,23581541,39060235,Cardiac Vascular Intensive Care Unit (CVICU),Cardiac Vascular Intensive Care Unit (CVICU),2160-05-18 10:00:53,2160-05-19 17:33:33,1.3143518518518518
10002155,20345487,32358465,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2131-03-09 21:33:00,2131-03-10 18:09:21,0.8585763888888889
10002155,23822395,33685454,Coronary Care Unit (CCU),Coronary Care Unit (CCU),2129-08-04 12:45:00,2129-08-10 17:02:38,6.178912037037037

Q2. ICU stays

icustays.csv.gz (https://mimic.mit.edu/docs/iv/modules/icu/icustays/) contains data about Intensive Care Units (ICU) stays. The first 10 lines are

zcat < ~/mimic/icu/icustays.csv.gz | head
subject_id,hadm_id,stay_id,first_careunit,last_careunit,intime,outtime,los
10000032,29079034,39553978,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2180-07-23 14:00:00,2180-07-23 23:50:47,0.4102662037037037
10000980,26913865,39765666,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2189-06-27 08:42:00,2189-06-27 20:38:27,0.4975347222222222
10001217,24597018,37067082,Surgical Intensive Care Unit (SICU),Surgical Intensive Care Unit (SICU),2157-11-20 19:18:02,2157-11-21 22:08:00,1.1180324074074075
10001217,27703517,34592300,Surgical Intensive Care Unit (SICU),Surgical Intensive Care Unit (SICU),2157-12-19 15:42:24,2157-12-20 14:27:41,0.9481134259259258
10001725,25563031,31205490,Medical/Surgical Intensive Care Unit (MICU/SICU),Medical/Surgical Intensive Care Unit (MICU/SICU),2110-04-11 15:52:22,2110-04-12 23:59:56,1.338587962962963
10001884,26184834,37510196,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2131-01-11 04:20:05,2131-01-20 08:27:30,9.171817129629629
10002013,23581541,39060235,Cardiac Vascular Intensive Care Unit (CVICU),Cardiac Vascular Intensive Care Unit (CVICU),2160-05-18 10:00:53,2160-05-19 17:33:33,1.3143518518518518
10002155,20345487,32358465,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2131-03-09 21:33:00,2131-03-10 18:09:21,0.8585763888888889
10002155,23822395,33685454,Coronary Care Unit (CCU),Coronary Care Unit (CCU),2129-08-04 12:45:00,2129-08-10 17:02:38,6.178912037037037

Q2.1 Ingestion

Import icustays.csv.gz as a tibble icustays_tble.

Answer:

# read in icu stays information
icustays_tble <- read_csv("~/mimic/icu/icustays.csv.gz")

Q2.2 Summary and visualization

How many unique subject_id? Can a subject_id have multiple ICU stays? Summarize the number of ICU stays per subject_id by graphs.

Answer: There are 50920 unique subject_id. A subject_id can have multiple ICU stays. The number of ICU stays per subject_id is shown in the following figure.

# summarize the number of unique subject_id
icustays_tble |> 
  select(subject_id) |>
  unique() |>
  count()
# A tibble: 1 × 1
      n
  <int>
1 50920
# summarize the number of ICU stays per subject_id by graphs
icustays_tble |> 
  count(subject_id) |>
  ggplot(aes(x = n)) +
  geom_bar() +
  xlab("Number of ICU stays") +
  ylab("Count") +
  ggtitle(label = "Count of ICU stays for each patient") 

Q3. admissions data

Information of the patients admitted into hospital is available in admissions.csv.gz. See https://mimic.mit.edu/docs/iv/modules/hosp/admissions/ for details of each field in this file. The first 10 lines are

zcat < ~/mimic/hosp/admissions.csv.gz | head
subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admit_provider_id,admission_location,discharge_location,insurance,language,marital_status,race,edregtime,edouttime,hospital_expire_flag
10000032,22595853,2180-05-06 22:23:00,2180-05-07 17:15:00,,URGENT,P874LG,TRANSFER FROM HOSPITAL,HOME,Other,ENGLISH,WIDOWED,WHITE,2180-05-06 19:17:00,2180-05-06 23:30:00,0
10000032,22841357,2180-06-26 18:27:00,2180-06-27 18:49:00,,EW EMER.,P09Q6Y,EMERGENCY ROOM,HOME,Medicaid,ENGLISH,WIDOWED,WHITE,2180-06-26 15:54:00,2180-06-26 21:31:00,0
10000032,25742920,2180-08-05 23:44:00,2180-08-07 17:50:00,,EW EMER.,P60CC5,EMERGENCY ROOM,HOSPICE,Medicaid,ENGLISH,WIDOWED,WHITE,2180-08-05 20:58:00,2180-08-06 01:44:00,0
10000032,29079034,2180-07-23 12:35:00,2180-07-25 17:55:00,,EW EMER.,P30KEH,EMERGENCY ROOM,HOME,Medicaid,ENGLISH,WIDOWED,WHITE,2180-07-23 05:54:00,2180-07-23 14:00:00,0
10000068,25022803,2160-03-03 23:16:00,2160-03-04 06:26:00,,EU OBSERVATION,P51VDL,EMERGENCY ROOM,,Other,ENGLISH,SINGLE,WHITE,2160-03-03 21:55:00,2160-03-04 06:26:00,0
10000084,23052089,2160-11-21 01:56:00,2160-11-25 14:52:00,,EW EMER.,P6957U,WALK-IN/SELF REFERRAL,HOME HEALTH CARE,Medicare,ENGLISH,MARRIED,WHITE,2160-11-20 20:36:00,2160-11-21 03:20:00,0
10000084,29888819,2160-12-28 05:11:00,2160-12-28 16:07:00,,EU OBSERVATION,P63AD6,PHYSICIAN REFERRAL,,Medicare,ENGLISH,MARRIED,WHITE,2160-12-27 18:32:00,2160-12-28 16:07:00,0
10000108,27250926,2163-09-27 23:17:00,2163-09-28 09:04:00,,EU OBSERVATION,P38XXV,EMERGENCY ROOM,,Other,ENGLISH,SINGLE,WHITE,2163-09-27 16:18:00,2163-09-28 09:04:00,0
10000117,22927623,2181-11-15 02:05:00,2181-11-15 14:52:00,,EU OBSERVATION,P2358X,EMERGENCY ROOM,,Other,ENGLISH,DIVORCED,WHITE,2181-11-14 21:51:00,2181-11-15 09:57:00,0

Q3.1 Ingestion

Import admissions.csv.gz as a tibble admissions_tble.

Answer:

# read in admissions information
admissions_tble <- read_csv("~/mimic/hosp/admissions.csv.gz")

Q3.2 Summary and visualization

Summarize the following information by graphics and explain any patterns you see.

  • number of admissions per patient
  • admission hour (anything unusual?)
  • admission minute (anything unusual?)
  • length of hospital stay (from admission to discharge) (anything unusual?)

According to the MIMIC-IV documentation,

All dates in the database have been shifted to protect patient confidentiality. Dates will be internally consistent for the same patient, but randomly distributed in the future. Dates of birth which occur in the present time are not true dates of birth. Furthermore, dates of birth which occur before the year 1900 occur if the patient is older than 89. In these cases, the patient’s age at their first admission has been fixed to 300.

Answer: The number of admissions per patient is shown in the first figure. It shows strong rightskewness which matches our expectation because majority of people do not need to come to hospital frequently. The admission hour is shown in the following figure. The pattern shows that more people are admitted between 2pm to 12am. There is a peak of admission at 7am which might suggest the time the hospital opens to public in general. The highest peak is at 12am which is unusual since I expect less people to go at midnight. The admission minute is shown in the third figure. The overall pattern is roughly even distributed but there are four unusual peak at 0, 15, 30, 45 minutes. I think there might be rounding at these time points. The length of hospital stay is shown in the last figure. The pattern shows that most people stay in hospital for short period which matches our expectation that most people do not need to stay in hospital for long.

# summarize the number of admissions per patient by bar plot
admissions_tble |> 
  count(subject_id) |>
  ggplot(aes(x = n)) +
  geom_bar() +
  xlab("Number of admissions") +
  ylab("Count") +
  ggtitle("Count of admissions per patient")

# summarize the admission hour by bar plot
admissions_tble |> 
  mutate(hour = hour(admittime)) |>
  ggplot(aes(x = hour)) +
  geom_bar() +
  xlab("Hour") +
  ylab("Count") +
  ggtitle("Admission hour in a day for each admission")

# summarize the admission minute by bar plot
admissions_tble |> 
  mutate(minute = minute(admittime)) |>
  ggplot(aes(x = minute)) +
  geom_bar() +
  xlab("Minute") +
  ylab("Count") +
  ggtitle("Admission minute in an hour for each admission")

# summarize the length of hospital stay by bar plot
admissions_tble |> 
  mutate(days = difftime(dischtime, admittime, units = "days")) |>
  ggplot(aes(x = days)) +
  geom_bar() +
  xlab("length of stay (days)") +
  ylab("Count") +
  ggtitle("Length of hospital stay for each admission")

Q4. patients data

Patient information is available in patients.csv.gz. See https://mimic.mit.edu/docs/iv/modules/hosp/patients/ for details of each field in this file. The first 10 lines are

zcat < ~/mimic/hosp/patients.csv.gz | head
subject_id,gender,anchor_age,anchor_year,anchor_year_group,dod
10000032,F,52,2180,2014 - 2016,2180-09-09
10000048,F,23,2126,2008 - 2010,
10000068,F,19,2160,2008 - 2010,
10000084,M,72,2160,2017 - 2019,2161-02-13
10000102,F,27,2136,2008 - 2010,
10000108,M,25,2163,2014 - 2016,
10000115,M,24,2154,2017 - 2019,
10000117,F,48,2174,2008 - 2010,
10000178,F,59,2157,2017 - 2019,

Q4.1 Ingestion

Import patients.csv.gz (https://mimic.mit.edu/docs/iv/modules/hosp/patients/) as a tibble patients_tble.

Answer:

# read in patients information
patients_tble <- read_csv("~/mimic/hosp/patients.csv.gz")

Q4.2 Summary and visualization

Summarize variables gender and anchor_age by graphics, and explain any patterns you see.

Answer: For variable gender which displays in the first plot, the number of female patients seem to be more than male patients. For variable anchor_age which displays in the second plot, the age of patients is right skewed which means there is higher percentage of younger patients overall. Especially, many patients are under 25 years old.

# summarize gender in bar chart
ggplot(data = patients_tble) +
  geom_bar(aes(x = gender, fill = gender)) +
  ggtitle("Gender count among patients")

# summarize gender in histogram
ggplot(data = patients_tble) +
  geom_histogram(aes(x = anchor_age)) +
  xlab("Age") +
  ggtitle("Count of patients age")

Q5. Lab results

labevents.csv.gz (https://mimic.mit.edu/docs/iv/modules/hosp/labevents/) contains all laboratory measurements for patients. The first 10 lines are

zcat < ~/mimic/hosp/labevents.csv.gz | head
labevent_id,subject_id,hadm_id,specimen_id,itemid,order_provider_id,charttime,storetime,value,valuenum,valueuom,ref_range_lower,ref_range_upper,flag,priority,comments
1,10000032,,45421181,51237,P28Z0X,2180-03-23 11:51:00,2180-03-23 15:15:00,1.4,1.4,,0.9,1.1,abnormal,ROUTINE,
2,10000032,,45421181,51274,P28Z0X,2180-03-23 11:51:00,2180-03-23 15:15:00,___,15.1,sec,9.4,12.5,abnormal,ROUTINE,VERIFIED.
3,10000032,,52958335,50853,P28Z0X,2180-03-23 11:51:00,2180-03-25 11:06:00,___,15,ng/mL,30,60,abnormal,ROUTINE,NEW ASSAY IN USE ___: DETECTS D2 AND D3 25-OH ACCURATELY.
4,10000032,,52958335,50861,P28Z0X,2180-03-23 11:51:00,2180-03-23 16:40:00,102,102,IU/L,0,40,abnormal,ROUTINE,
5,10000032,,52958335,50862,P28Z0X,2180-03-23 11:51:00,2180-03-23 16:40:00,3.3,3.3,g/dL,3.5,5.2,abnormal,ROUTINE,
6,10000032,,52958335,50863,P28Z0X,2180-03-23 11:51:00,2180-03-23 16:40:00,109,109,IU/L,35,105,abnormal,ROUTINE,
7,10000032,,52958335,50864,P28Z0X,2180-03-23 11:51:00,2180-03-23 16:40:00,___,8,ng/mL,0,8.7,,ROUTINE,MEASURED BY ___.
8,10000032,,52958335,50868,P28Z0X,2180-03-23 11:51:00,2180-03-23 16:40:00,12,12,mEq/L,8,20,,ROUTINE,
9,10000032,,52958335,50878,P28Z0X,2180-03-23 11:51:00,2180-03-23 16:40:00,143,143,IU/L,0,40,abnormal,ROUTINE,

d_labitems.csv.gz (https://mimic.mit.edu/docs/iv/modules/hosp/d_labitems/) is the dictionary of lab measurements.

zcat < ~/mimic/hosp/d_labitems.csv.gz | head
itemid,label,fluid,category
50801,Alveolar-arterial Gradient,Blood,Blood Gas
50802,Base Excess,Blood,Blood Gas
50803,"Calculated Bicarbonate, Whole Blood",Blood,Blood Gas
50804,Calculated Total CO2,Blood,Blood Gas
50805,Carboxyhemoglobin,Blood,Blood Gas
50806,"Chloride, Whole Blood",Blood,Blood Gas
50808,Free Calcium,Blood,Blood Gas
50809,Glucose,Blood,Blood Gas
50810,"Hematocrit, Calculated",Blood,Blood Gas

We are interested in the lab measurements of creatinine (50912), potassium (50971), sodium (50983), chloride (50902), bicarbonate (50882), hematocrit (51221), white blood cell count (51301), and glucose (50931). Retrieve a subset of labevents.csv.gz that only containing these items for the patients in icustays_tble. Further restrict to the last available measurement (by storetime) before the ICU stay. The final labevents_tble should have one row per ICU stay and columns for each lab measurement.

Hint: Use the Parquet format you generated in Homework 2. For reproducibility, make labevents_pq folder available at the current working directory hw3, for example, by a symbolic link.

Answer:

# clear previous memory usage
rm(list = ls())
# read in lab measurements and remove unnecessary measurements
dlabitems_tble <- read_csv("~/mimic/hosp/d_labitems.csv.gz") |>
  mutate(label = gsub(" ", "_", label)) |>
  filter(itemid %in% c(50912, 50971, 50983, 50902, 50882, 51221, 51301, 50931))
# create a dictionary for itemid and label
labels <- setNames(dlabitems_tble$label, dlabitems_tble$itemid)

Here is the subset that only containing these items for the patients in icustays_tble

# read in icustays 
icustays_tble <- read_csv("~/mimic/icu/icustays.csv.gz")
# read in lab events
labevents_tble_orig <- arrow::open_dataset("./labevents_pq") |>
  # select necessary columns
  dplyr::select(subject_id, itemid, valuenum, storetime) |>
  # filter out unnecessary itemid
  dplyr::filter(itemid %in% dlabitems_tble$itemid) |>
  # filter out unnecessary subject_id
  dplyr::filter(subject_id %in% icustays_tble$subject_id) |>
  # convert to tibble
  collect() |>
  # convert item id to actual labels
  mutate(itemid = labels[as.character(itemid)]) |>
  print(width = Inf)
# A tibble: 13,473,870 × 4
   subject_id itemid            valuenum storetime          
        <int> <chr>                <dbl> <dttm>             
 1   10000032 Bicarbonate           27   2180-03-23 09:40:00
 2   10000032 Chloride             101   2180-03-23 09:40:00
 3   10000032 Creatinine             0.4 2180-03-23 09:40:00
 4   10000032 Potassium              3.7 2180-03-23 09:40:00
 5   10000032 Sodium               136   2180-03-23 09:40:00
 6   10000032 Glucose               95   2180-03-23 08:56:00
 7   10000032 Hematocrit            45.4 2180-03-23 08:19:00
 8   10000032 White_Blood_Cells      3   2180-03-23 08:19:00
 9   10000032 Hematocrit            42.6 2180-05-06 15:42:00
10   10000032 White_Blood_Cells      5   2180-05-06 15:42:00
# ℹ 13,473,860 more rows

Here is the final tibble:

# create labevents_tble 
labevents_tble <- labevents_tble_orig |>
  # join tibble by subject_id
  left_join(
    # select necessary columns in icustay_tble 
    select(icustays_tble, subject_id, stay_id, intime),
    by = "subject_id"
    ) |>
  # filter out storetime that is before intime which is caused by joining 
  # table above. Some people went ICU multiple times
  filter(storetime < intime) |>
  # group by subject_id, stay_id, itemid
  group_by(subject_id, stay_id, itemid) |>
  # within the above group, sort by storetime in ascending order
  arrange(storetime, by_group = TRUE) |>
  # within the above group, slice the last row which is the last 
  # available measurement
  slice_tail(n = 1) |>
  # remove unnecessary columns
  select(-storetime, -intime) |>
  # convert pivot long to pivot wide
  pivot_wider(names_from = itemid, values_from = valuenum) |>
  ungroup() |>
  # change the column names to lower case
  rename_with(tolower) |>
  print(width = Inf)
# A tibble: 68,467 × 10
   subject_id  stay_id bicarbonate chloride creatinine glucose hematocrit
        <dbl>    <dbl>       <dbl>    <dbl>      <dbl>   <dbl>      <dbl>
 1   10000032 39553978          25       95        0.7     102       41.1
 2   10000980 39765666          21      109        2.3      89       27.3
 3   10001217 34592300          30      104        0.5      87       37.4
 4   10001217 37067082          22      108        0.6     112       38.1
 5   10001725 31205490          NA       98       NA        NA       NA  
 6   10001884 37510196          30       88        1.1     141       39.7
 7   10002013 39060235          24      102        0.9     288       34.9
 8   10002155 31090461          23       98        2.8     117       25.5
 9   10002155 32358465          26       85        1.4     133       22.4
10   10002155 33685454          24      105        1.1     138       39.7
   potassium sodium white_blood_cells
       <dbl>  <dbl>             <dbl>
 1       6.7    126               6.9
 2       3.9    144               5.3
 3       4.1    142               5.4
 4       4.2    142              15.7
 5       4.1    139              NA  
 6       4.5    130              12.2
 7       3.5    137               7.2
 8       4.9    135              17.9
 9       5.7    120               9.8
10       4.6    139               7.9
# ℹ 68,457 more rows

Q6. Vitals from charted events

chartevents.csv.gz (https://mimic.mit.edu/docs/iv/modules/icu/chartevents/) contains all the charted data available for a patient. During their ICU stay, the primary repository of a patient’s information is their electronic chart. The itemid variable indicates a single measurement type in the database. The value variable is the value measured for itemid. The first 10 lines of chartevents.csv.gz are

zcat < ~/mimic/icu/chartevents.csv.gz | head
subject_id,hadm_id,stay_id,caregiver_id,charttime,storetime,itemid,value,valuenum,valueuom,warning
10000032,29079034,39553978,47007,2180-07-23 21:01:00,2180-07-23 22:15:00,220179,82,82,mmHg,0
10000032,29079034,39553978,47007,2180-07-23 21:01:00,2180-07-23 22:15:00,220180,59,59,mmHg,0
10000032,29079034,39553978,47007,2180-07-23 21:01:00,2180-07-23 22:15:00,220181,63,63,mmHg,0
10000032,29079034,39553978,47007,2180-07-23 22:00:00,2180-07-23 22:15:00,220045,94,94,bpm,0
10000032,29079034,39553978,47007,2180-07-23 22:00:00,2180-07-23 22:15:00,220179,85,85,mmHg,0
10000032,29079034,39553978,47007,2180-07-23 22:00:00,2180-07-23 22:15:00,220180,55,55,mmHg,0
10000032,29079034,39553978,47007,2180-07-23 22:00:00,2180-07-23 22:15:00,220181,62,62,mmHg,0
10000032,29079034,39553978,47007,2180-07-23 22:00:00,2180-07-23 22:15:00,220210,20,20,insp/min,0
10000032,29079034,39553978,47007,2180-07-23 22:00:00,2180-07-23 22:15:00,220277,95,95,%,0

d_items.csv.gz (https://mimic.mit.edu/docs/iv/modules/icu/d_items/) is the dictionary for the itemid in chartevents.csv.gz.

zcat < ~/mimic/icu/d_items.csv.gz | head
itemid,label,abbreviation,linksto,category,unitname,param_type,lownormalvalue,highnormalvalue
220001,Problem List,Problem List,chartevents,General,,Text,,
220003,ICU Admission date,ICU Admission date,datetimeevents,ADT,,Date and time,,
220045,Heart Rate,HR,chartevents,Routine Vital Signs,bpm,Numeric,,
220046,Heart rate Alarm - High,HR Alarm - High,chartevents,Alarms,bpm,Numeric,,
220047,Heart Rate Alarm - Low,HR Alarm - Low,chartevents,Alarms,bpm,Numeric,,
220048,Heart Rhythm,Heart Rhythm,chartevents,Routine Vital Signs,,Text,,
220050,Arterial Blood Pressure systolic,ABPs,chartevents,Routine Vital Signs,mmHg,Numeric,90,140
220051,Arterial Blood Pressure diastolic,ABPd,chartevents,Routine Vital Signs,mmHg,Numeric,60,90
220052,Arterial Blood Pressure mean,ABPm,chartevents,Routine Vital Signs,mmHg,Numeric,,

We are interested in the vitals for ICU patients: heart rate (220045), systolic non-invasive blood pressure (220179), diastolic non-invasive blood pressure (220180), body temperature in Fahrenheit (223761), and respiratory rate (220210). Retrieve a subset of chartevents.csv.gz only containing these items for the patients in icustays_tble. Further restrict to the first vital measurement within the ICU stay. The final chartevents_tble should have one row per ICU stay and columns for each vital measurement.

Hint: Use the Parquet format you generated in Homework 2. For reproducibility, make chartevents_pq folder available at the current working directory, for example, by a symbolic link.

Answer:

# read in vitals and remove unnecessary measurements
ditems_tble <- read_csv("~/mimic/icu/d_items.csv.gz") |>
  # replace space in label with underscore
  mutate(label = gsub(" ", "_", label)) |>
  # filter out unnecessary itemid
  filter(itemid %in% c(220045, 220179, 220180, 223761, 220210))
# create a dictionary for id of vital and vital
labels <- setNames(ditems_tble$label, ditems_tble$itemid)

Here is the subset of the vitals for the patients in icustays_tble:

# read in icustays
icustays_tble <- read_csv("~/mimic/icu/icustays.csv.gz")
# read in chart events
chartevents_tble_orig <- arrow::open_dataset("./chartevents_pq") |>
  # select necessary columns
  dplyr::select(subject_id, stay_id, itemid, valuenum, charttime) |>
  # filter out unnecessary itemid
  dplyr::filter(itemid %in% ditems_tble$itemid) |>
  # filter out unnecessary subject_id
  dplyr::filter(subject_id %in% icustays_tble$subject_id) |>
  # convert to tibble
  collect() |>
  # convert item id to actual labels (vitals)
  mutate(itemid = labels[as.character(itemid)]) |>
  print(width = Inf)
# A tibble: 22,504,119 × 5
   subject_id  stay_id itemid                                valuenum
        <int>    <int> <chr>                                    <dbl>
 1   10001884 37510196 Non_Invasive_Blood_Pressure_systolic       149
 2   10001884 37510196 Non_Invasive_Blood_Pressure_diastolic       97
 3   10001884 37510196 Heart_Rate                                  78
 4   10001884 37510196 Respiratory_Rate                            18
 5   10001884 37510196 Non_Invasive_Blood_Pressure_systolic       145
 6   10001884 37510196 Non_Invasive_Blood_Pressure_diastolic       80
 7   10001884 37510196 Heart_Rate                                  82
 8   10001884 37510196 Respiratory_Rate                            28
 9   10001884 37510196 Non_Invasive_Blood_Pressure_systolic       144
10   10001884 37510196 Non_Invasive_Blood_Pressure_diastolic       83
   charttime          
   <dttm>             
 1 2131-01-12 12:01:00
 2 2131-01-12 12:01:00
 3 2131-01-12 13:00:00
 4 2131-01-12 13:00:00
 5 2131-01-12 13:01:00
 6 2131-01-12 13:01:00
 7 2131-01-12 14:00:00
 8 2131-01-12 14:00:00
 9 2131-01-12 14:01:00
10 2131-01-12 14:01:00
# ℹ 22,504,109 more rows

Here is the final tibble:

# create chartevents_tble
charevents_tble <- chartevents_tble_orig |>
  # join tibble by stay_id
  left_join(
    # select necessary columns in icustay_tble
    select(icustays_tble, stay_id, intime, outtime),
    by = "stay_id"
    ) |>
  # filter out charttime that is during icu stay
  filter(charttime >= intime, charttime <= outtime) |>
  # group by stay_id, itemid
  group_by(stay_id, itemid) |>
  # within the above group, sort by charttime in ascending order
  arrange(charttime, by_group = TRUE) |>
  # within the above group, slice the first row which is the first vital
  slice_head(n = 1) |>
  # remove unnecessary columns
  select(-charttime, -intime, -outtime) |>
  # convert pivot long to pivot wide
  pivot_wider(names_from = itemid, values_from = valuenum) |>
  ungroup() |>
  # change the column names to lower case
  rename_with(tolower) |>
  # sort by subject_id, stay_id in ascending order
  arrange(subject_id, stay_id) |>
  print(width = Inf)
# A tibble: 73,164 × 7
   subject_id  stay_id heart_rate non_invasive_blood_pressure_diastolic
        <int>    <dbl>      <dbl>                                 <dbl>
 1   10000032 39553978         91                                    48
 2   10000980 39765666         77                                    77
 3   10001217 34592300         96                                    95
 4   10001217 37067082         86                                    90
 5   10001725 31205490         55                                    56
 6   10001884 37510196         38                                    12
 7   10002013 39060235         80                                    70
 8   10002155 31090461         94                                    51
 9   10002155 32358465         98                                    65
10   10002155 33685454         68                                    61
   non_invasive_blood_pressure_systolic respiratory_rate temperature_fahrenheit
                                  <dbl>            <dbl>                  <dbl>
 1                                   84               24                   98.7
 2                                  150               23                   98  
 3                                  167               11                   97.6
 4                                  151               18                   98.5
 5                                   73               19                   97.7
 6                                  180               10                   98.1
 7                                  104               14                   97.2
 8                                  118               18                   96.9
 9                                  109               23                   97.7
10                                  126               18                   95.9
# ℹ 73,154 more rows

Q7. Putting things together

Let us create a tibble mimic_icu_cohort for all ICU stays, where rows are all ICU stays of adults (age at intime >= 18) and columns contain at least following variables

  • all variables in icustays_tble
  • all variables in admissions_tble
  • all variables in patients_tble
  • the last lab measurements before the ICU stay in labevents_tble
  • the first vital measurements during the ICU stay in chartevents_tble

The final mimic_icu_cohort should have one row per ICU stay and columns for each variable.

Answer:

# read in necessary data as tibbles
patients_tble <- read_csv("~/mimic/hosp/patients.csv.gz")
admissions_tble <- read_csv("~/mimic/hosp/admissions.csv.gz")
icustays_tble <- read_csv("~/mimic/icu/icustays.csv.gz")
# join all the necessary tibbles
mimic_icu_cohort <- icustays_tble |>
  left_join(admissions_tble, by = c("subject_id", "hadm_id")) |>
  left_join(patients_tble, by = "subject_id") |>
  left_join(labevents_tble, by = c("subject_id", "stay_id")) |>
  left_join(charevents_tble, by = c("subject_id", "stay_id")) |>
  # create a new variable age_intime which represents the age at intime
  mutate(age_intime = anchor_age + (year(intime) - anchor_year)) |>
  # filter out age at in time greater 18
  filter(age_intime >= 18) |>
  print(width = Inf)
# A tibble: 73,181 × 41
   subject_id  hadm_id  stay_id first_careunit                                  
        <dbl>    <dbl>    <dbl> <chr>                                           
 1   10000032 29079034 39553978 Medical Intensive Care Unit (MICU)              
 2   10000980 26913865 39765666 Medical Intensive Care Unit (MICU)              
 3   10001217 24597018 37067082 Surgical Intensive Care Unit (SICU)             
 4   10001217 27703517 34592300 Surgical Intensive Care Unit (SICU)             
 5   10001725 25563031 31205490 Medical/Surgical Intensive Care Unit (MICU/SICU)
 6   10001884 26184834 37510196 Medical Intensive Care Unit (MICU)              
 7   10002013 23581541 39060235 Cardiac Vascular Intensive Care Unit (CVICU)    
 8   10002155 20345487 32358465 Medical Intensive Care Unit (MICU)              
 9   10002155 23822395 33685454 Coronary Care Unit (CCU)                        
10   10002155 28994087 31090461 Medical/Surgical Intensive Care Unit (MICU/SICU)
   last_careunit                                    intime             
   <chr>                                            <dttm>             
 1 Medical Intensive Care Unit (MICU)               2180-07-23 14:00:00
 2 Medical Intensive Care Unit (MICU)               2189-06-27 08:42:00
 3 Surgical Intensive Care Unit (SICU)              2157-11-20 19:18:02
 4 Surgical Intensive Care Unit (SICU)              2157-12-19 15:42:24
 5 Medical/Surgical Intensive Care Unit (MICU/SICU) 2110-04-11 15:52:22
 6 Medical Intensive Care Unit (MICU)               2131-01-11 04:20:05
 7 Cardiac Vascular Intensive Care Unit (CVICU)     2160-05-18 10:00:53
 8 Medical Intensive Care Unit (MICU)               2131-03-09 21:33:00
 9 Coronary Care Unit (CCU)                         2129-08-04 12:45:00
10 Medical/Surgical Intensive Care Unit (MICU/SICU) 2130-09-24 00:50:00
   outtime               los admittime           dischtime          
   <dttm>              <dbl> <dttm>              <dttm>             
 1 2180-07-23 23:50:47 0.410 2180-07-23 12:35:00 2180-07-25 17:55:00
 2 2189-06-27 20:38:27 0.498 2189-06-27 07:38:00 2189-07-03 03:00:00
 3 2157-11-21 22:08:00 1.12  2157-11-18 22:56:00 2157-11-25 18:00:00
 4 2157-12-20 14:27:41 0.948 2157-12-18 16:58:00 2157-12-24 14:55:00
 5 2110-04-12 23:59:56 1.34  2110-04-11 15:08:00 2110-04-14 15:00:00
 6 2131-01-20 08:27:30 9.17  2131-01-07 20:39:00 2131-01-20 05:15:00
 7 2160-05-19 17:33:33 1.31  2160-05-18 07:45:00 2160-05-23 13:30:00
 8 2131-03-10 18:09:21 0.859 2131-03-09 20:33:00 2131-03-10 01:55:00
 9 2129-08-10 17:02:38 6.18  2129-08-04 12:44:00 2129-08-18 16:53:00
10 2130-09-27 22:13:41 3.89  2130-09-23 21:59:00 2130-09-29 18:55:00
   deathtime           admission_type              admit_provider_id
   <dttm>              <chr>                       <chr>            
 1 NA                  EW EMER.                    P30KEH           
 2 NA                  EW EMER.                    P30KEH           
 3 NA                  EW EMER.                    P4645A           
 4 NA                  DIRECT EMER.                P99698           
 5 NA                  EW EMER.                    P35SU0           
 6 2131-01-20 05:15:00 OBSERVATION ADMIT           P874LG           
 7 NA                  SURGICAL SAME DAY ADMISSION P47E1G           
 8 2131-03-10 21:53:00 EW EMER.                    P80515           
 9 NA                  EW EMER.                    P05HUO           
10 NA                  EW EMER.                    P3529J           
   admission_location discharge_location           insurance language
   <chr>              <chr>                        <chr>     <chr>   
 1 EMERGENCY ROOM     HOME                         Medicaid  ENGLISH 
 2 EMERGENCY ROOM     HOME HEALTH CARE             Medicare  ENGLISH 
 3 EMERGENCY ROOM     HOME HEALTH CARE             Other     ?       
 4 PHYSICIAN REFERRAL HOME HEALTH CARE             Other     ?       
 5 PACU               HOME                         Other     ENGLISH 
 6 EMERGENCY ROOM     DIED                         Medicare  ENGLISH 
 7 PHYSICIAN REFERRAL HOME HEALTH CARE             Medicare  ENGLISH 
 8 EMERGENCY ROOM     DIED                         Other     ENGLISH 
 9 PROCEDURE SITE     CHRONIC/LONG TERM ACUTE CARE Other     ENGLISH 
10 EMERGENCY ROOM     HOME HEALTH CARE             Other     ENGLISH 
   marital_status race                   edregtime           edouttime          
   <chr>          <chr>                  <dttm>              <dttm>             
 1 WIDOWED        WHITE                  2180-07-23 05:54:00 2180-07-23 14:00:00
 2 MARRIED        BLACK/AFRICAN AMERICAN 2189-06-27 06:25:00 2189-06-27 08:42:00
 3 MARRIED        WHITE                  2157-11-18 17:38:00 2157-11-19 01:24:00
 4 MARRIED        WHITE                  NA                  NA                 
 5 MARRIED        WHITE                  NA                  NA                 
 6 MARRIED        BLACK/AFRICAN AMERICAN 2131-01-07 13:36:00 2131-01-07 22:13:00
 7 SINGLE         OTHER                  NA                  NA                 
 8 MARRIED        WHITE                  2131-03-09 19:14:00 2131-03-09 21:33:00
 9 MARRIED        WHITE                  2129-08-04 11:00:00 2129-08-04 12:35:00
10 MARRIED        WHITE                  2130-09-23 19:59:00 2130-09-24 00:50:00
   hospital_expire_flag gender anchor_age anchor_year anchor_year_group
                  <dbl> <chr>       <dbl>       <dbl> <chr>            
 1                    0 F              52        2180 2014 - 2016      
 2                    0 F              73        2186 2008 - 2010      
 3                    0 F              55        2157 2011 - 2013      
 4                    0 F              55        2157 2011 - 2013      
 5                    0 F              46        2110 2011 - 2013      
 6                    1 F              68        2122 2008 - 2010      
 7                    0 F              53        2156 2008 - 2010      
 8                    1 F              80        2128 2008 - 2010      
 9                    0 F              80        2128 2008 - 2010      
10                    0 F              80        2128 2008 - 2010      
   dod        bicarbonate chloride creatinine glucose hematocrit potassium
   <date>           <dbl>    <dbl>      <dbl>   <dbl>      <dbl>     <dbl>
 1 2180-09-09          25       95        0.7     102       41.1       6.7
 2 2193-08-26          21      109        2.3      89       27.3       3.9
 3 NA                  22      108        0.6     112       38.1       4.2
 4 NA                  30      104        0.5      87       37.4       4.1
 5 NA                  NA       98       NA        NA       NA         4.1
 6 2131-01-20          30       88        1.1     141       39.7       4.5
 7 NA                  24      102        0.9     288       34.9       3.5
 8 2131-03-10          26       85        1.4     133       22.4       5.7
 9 2131-03-10          24      105        1.1     138       39.7       4.6
10 2131-03-10          23       98        2.8     117       25.5       4.9
   sodium white_blood_cells heart_rate non_invasive_blood_pressure_diastolic
    <dbl>             <dbl>      <dbl>                                 <dbl>
 1    126               6.9         91                                    48
 2    144               5.3         77                                    77
 3    142              15.7         86                                    90
 4    142               5.4         96                                    95
 5    139              NA           55                                    56
 6    130              12.2         38                                    12
 7    137               7.2         80                                    70
 8    120               9.8         98                                    65
 9    139               7.9         68                                    61
10    135              17.9         94                                    51
   non_invasive_blood_pressure_systolic respiratory_rate temperature_fahrenheit
                                  <dbl>            <dbl>                  <dbl>
 1                                   84               24                   98.7
 2                                  150               23                   98  
 3                                  151               18                   98.5
 4                                  167               11                   97.6
 5                                   73               19                   97.7
 6                                  180               10                   98.1
 7                                  104               14                   97.2
 8                                  109               23                   97.7
 9                                  126               18                   95.9
10                                  118               18                   96.9
   age_intime
        <dbl>
 1         52
 2         76
 3         55
 4         55
 5         46
 6         77
 7         57
 8         83
 9         81
10         82
# ℹ 73,171 more rows

Q8. Exploratory data analysis (EDA)

Summarize the following information about the ICU stay cohort mimic_icu_cohort using appropriate numerics or graphs:

  • Length of ICU stay los vs demographic variables (race, insurance, marital_status, gender, age at intime)

Answer:

Here are graph and description of the graph.

In this plot, we can see people who have medicare insurance has higher percentage in older age. The length of stay of older people might be longer among people with different insurance.

# plot with jitter point plot to add randomness to the discrete points
ggplot(data = mimic_icu_cohort) + 
  geom_jitter(mapping = aes(x = age_intime, y = los, color = insurance)) +
  xlab("Age at intime") +
  ylab("Length of ICU stay")

In this plot, we can see people who are married and single have longer ICU stay. The distribution of female and male seem to be similar in each marital status.

# plot with bar chart
ggplot(data = mimic_icu_cohort) + 
  geom_jitter(mapping = aes(x = marital_status, y = los, color = gender)) +
  xlab("Marital status") +
  ylab("Length of ICU stay")

# Combine groups in race based on human sense
mimic_icu_cohort <- mimic_icu_cohort |> 
  mutate(race = as.factor(race)) |> # Ensure race is a factor
  mutate(race = fct_collapse(
    race,
    ASIAN = str_subset(race, "ASIAN"),
    BLACK = str_subset(race, "BLACK"),
    HISPANIC = str_subset(race, "HISPANIC"),
    WHITE = str_subset(race, "WHITE"),
    UNKNOWN = c("UNABLE TO OBTAIN", "PATIENT DECLINED TO ANSWER"),
    OTHER = c("AMERICAN INDIAN/ALASKA NATIVE", "MULTIPLE RACE/ETHNICITY", 
              "NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER", "PORTUGUESE", 
              "SOUTH AMERICAN")
  ))

In this plot, we can observe many outliers in the length of stay for different races. The general distribution of length of stay for each race seems similar.

# plot with box plot
ggplot(data = mimic_icu_cohort) + 
  geom_boxplot(mapping = aes(x = race, y = los, fill = race)) +
  theme(legend.position = "none") +
  coord_flip() +
  ylab('Length of Stay')

  • Length of ICU stay los vs the last available lab measurements before ICU stay
# read in data and only keep the necessary lab measurements
dlabitems_tble <- read_csv("~/mimic/hosp/d_labitems.csv.gz") |>
  mutate(label = gsub(" ", "_", label)) |>
  filter(itemid %in% c(50912, 50971, 50983, 50902, 50882, 51221, 51301, 50931))
# create a dictionary for the lab measurements
labels_lab <- setNames(dlabitems_tble$label, dlabitems_tble$itemid)

Most lab measurements show a normal distribution which is good. Some particular lab measurements like “creatinine”, “glucose” and “white blood cells” show a right skewed pattern which might indicate that lower measure of these indicators will associtate with longer length of stay.

# plot for each lab measurement
for (name in names(labels_lab)) { 
  lab_measure = tolower(labels_lab[name])
  plot <- ggplot(data = mimic_icu_cohort) +
    # !!sym() convert the string to read as a variable
    geom_point(mapping = aes(x = !!sym(lab_measure), y = los)) +
    ylab('Length of Stay')
  print(plot)
}

  • Length of ICU stay los vs the first vital measurements within the ICU stay
# read in data and only keep the necessary vital measurements
ditems_tble <- read_csv("~/mimic/icu/d_items.csv.gz") |>
  mutate(label = gsub(" ", "_", label)) |>
  filter(itemid %in% c(220045, 220179, 220180, 223761, 220210))
# create a dictionary for the vital measurements
labels_vitals <- setNames(ditems_tble$label, ditems_tble$itemid)

Here is the plot for each vital measurement. We can see that the distribution are normal after we filter the outliers.

# plot for each vital measurement with filtering outliers
for (name in names(labels_vitals)) { 
  vital <- tolower(labels_vitals[name])
  
  # Calculate IQR and filter outliers
  bounds <- mimic_icu_cohort |>
    summarize(
      Q1 = quantile(!!sym(vital), 0.1, na.rm = TRUE),
      Q3 = quantile(!!sym(vital), 0.9, na.rm = TRUE)
    ) |>
    mutate(
      lower = Q1 - 1.5 * (Q3 - Q1),
      upper = Q3 + 1.5 * (Q3 - Q1)
    )
  # filtering outliers
  filtered_data <- mimic_icu_cohort |>
    filter(!!sym(vital) >= bounds$lower, !!sym(vital) <= bounds$upper)
  
  # Plot
  plot <- ggplot(data = filtered_data, aes(x = !!sym(vital), y = los)) +
    geom_point() +
    ylab('Length of Stay') +
    ggtitle(paste("Plot of", vital, "vs Length of Stay without outliers"))

  print(plot)
}

  • Length of ICU stay los vs first ICU unit

In this plot, we can see patients in Neuro SICU has longer ICU stays than the rest of care units.

# plot with box plot
ggplot(data = mimic_icu_cohort) + 
  geom_boxplot(mapping = aes(x = first_careunit, y = los, 
                             fill = first_careunit)) +
  theme(legend.position = "bottom") +
  guides(fill = guide_legend(nrow = 5, ncol = 2)) +
  coord_flip() +
  xlab('') +
  ylab('Length of Stay') +
  labs(title = "Length of ICU stay versus first ICU unit", fill = "ICU Unit")